# ------------------------------------------------------------------------------
# Builds primary diagnoses for full cohort
# Author: Cassidy Shubatt <cshubatt@gmail.com>
# To run: bash 08_primary_diagnoses.sh
# ------------------------------------------------------------------------------

# Libraries --------------------------------------------------------------------
library(here)
library(yaml)
library(data.table)
library(tidyverse)
library(feather) # read_feather

u <- modules::use(here::here("lib", "util.R"))

# Load Data --------------------------------------------------------------------
message("Loading data...")
paths <- read_yaml(here::here("lib", "filepaths.yml"))

# ED encounter crosswalk. visit_id isn't exactly the same in the ed enc xwalk
# and the ed dia xwalk, so the merges below also rely on ptid and start_date,
# which is imperfect. enc_row_id is renamed to ed_enc_id while selecting.
ed_enc_xwalk_raw <- read_feather(paths$cohort$ed_enc_xwalk) %>%
  select(ptid, ed_enc_id = enc_row_id, visit_id, start_date)

# Primary diagnoses only (primary_flag is compared against the string "1").
# dia_date is renamed to start_date so it shares a name with the encounter
# xwalk's date column.
# NOTE(review): ptid is coerced to numeric here but not in the encounter
# xwalk -- presumably the xwalk already stores ptid as numeric; confirm.
primary_dia_raw <- read_feather(paths$analysis$ed_dia) %>%
  filter(primary_flag == "1") %>%
  mutate(ptid = as.numeric(ptid)) %>%
  select(ptid, visit_id, start_date = dia_date, dia_code, zc_cat_name)

# Merge ------------------------------------------------------------------------
message("Preparing to merge encs with primary diagnoses...")
# Flag ptid-date pairs that occur more than once in the encounter xwalk.
# duplicated() marks the second and later occurrences of a pair, so distinct()
# reduces them to a single row per repeated pair.
dup <- duplicated(ed_enc_xwalk_raw %>% select(ptid, start_date))
dup_df <- ed_enc_xwalk_raw[dup, ] %>%
  distinct(ptid, start_date) %>%
  mutate(dup_ptid_date = TRUE)

# Attach the flag to both tables; pairs absent from dup_df come back as NA
# from the join and are set to FALSE.
# NOTE(review): u$safe_left_join is called without explicit keys; presumably
# it joins on the shared column names like dplyr::left_join -- confirm in
# lib/util.R.
ed_enc_xwalk <- ed_enc_xwalk_raw %>%
  u$safe_left_join(dup_df) %>%
  mutate(dup_ptid_date = replace_na(dup_ptid_date, FALSE))
primary_dia <- primary_dia_raw %>%
  u$safe_left_join(dup_df) %>%
  mutate(
    dup_ptid_date = replace_na(dup_ptid_date, FALSE),
    # row_number() yields integer(0) on an empty table, whereas
    # seq.int(nrow(.)) would yield c(1, 0) and make mutate() fail.
    primary_row_id = row_number()
  )

# Empty accumulator; each matching pass below appends its matches to it.
matched_enc_dia <- tibble(
  ed_enc_id = double(), zc_cat_name = character(), dia_code = character(),
  primary_row_id = integer()
)

# Non-duplicated ---------------------------------------------------------------
message("Merging nonduplicated ptid-date pairs on ptid-date...")
primary_dia_nondup <- filter(primary_dia, !dup_ptid_date)
# visit_id is removed from the encounter side before joining; presumably so
# that only ptid/start_date (plus the flag) are shared with the diagnosis
# table -- confirm how u$safe_left_join picks its keys.
ed_enc_nondup <- ed_enc_xwalk %>%
  filter(!dup_ptid_date) %>%
  select(-visit_id) %>%
  u$safe_left_join(primary_dia_nondup) %>%
  # encounters with no zc_cat_name after the join are treated as unmatched
  filter(!is.na(zc_cat_name)) %>%
  select(ed_enc_id, zc_cat_name, dia_code, primary_row_id)
matched_enc_dia <- rbind(matched_enc_dia, ed_enc_nondup)
message(
  nrow(ed_enc_nondup),
  " encounters successfully matched using unique ptid-date pairs"
)

# Duplicated using visit_id ----------------------------------------------------
message("Merging duplicated ptid-date pairs using visit_id...")
primary_dia_dup <- filter(primary_dia, dup_ptid_date)
# visit_id is kept on the encounter side here, so it is shared with the
# diagnosis table in this pass (unlike the non-duplicated pass).
ed_enc_dup <- filter(ed_enc_xwalk, dup_ptid_date) %>%
  u$safe_left_join(primary_dia_dup) %>%
  select(ed_enc_id, zc_cat_name, dia_code, primary_row_id) %>%
  # encounters with no zc_cat_name after the join are treated as unmatched
  filter(!is.na(zc_cat_name))
matched_enc_dia <- rbind(matched_enc_dia, ed_enc_dup)
message(
  nrow(ed_enc_dup),
  " encounters successfully matched using ptid, date, and visit_id"
)

# Duplicated using first ptid-date match ---------------------------------------
message("Merging duplicated ptid-date pairs using first match...")
# Diagnoses not used by the earlier passes, reduced to the first remaining row
# of each ptid-date pair.
unmatched_dia <- primary_dia %>%
  filter(!primary_row_id %in% matched_enc_dia$primary_row_id) %>%
  group_by(ptid, start_date) %>%
  summarise(
    primary_row_id = first(primary_row_id),
    dia_code = first(dia_code),
    zc_cat_name = first(zc_cat_name)
  ) %>%
  ungroup()
# Encounters still without a diagnosis pick up that first remaining row.
unmatched_encs <- ed_enc_xwalk %>%
  filter(!ed_enc_id %in% matched_enc_dia$ed_enc_id) %>%
  u$safe_left_join(unmatched_dia) %>%
  select(ed_enc_id, zc_cat_name, dia_code, primary_row_id) %>%
  filter(!is.na(zc_cat_name))

# Append the final pass and give the diagnosis code its output column name.
matched_enc_dia <- rbind(matched_enc_dia, unmatched_encs) %>%
  rename(primary_dx_code = dia_code)

message(
  nrow(unmatched_encs),
  " encounters successfully matched using first ptid-date match in dia xwalk"
)
message(
  "Successfully matched ", nrow(matched_enc_dia),
  " ED Encounters to primary diagnosis"
)

# Save -------------------------------------------------------------------------
# Output location comes from the filepaths config loaded above.
primary_dx_path <- paths$analysis$primary_dx
message("Saving primary diagnosis xwalk to ", primary_dx_path, "...")
write_rds(matched_enc_dia, primary_dx_path)

message("Done.")
